IMPORTING LIBRARIES TO PERFORM EDA¶
InĀ [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from matplotlib import cm
InĀ [2]:
df1 = pd.read_csv("C:/Users/Ramasish Chatterjee/Downloads/airline_passenger_satisfaction.csv") ##to read the csv file saved
df1
Out[2]:
| Unnamed: 0 | Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Male | Loyal Customer | 13 | Personal Travel | Eco Plus | 460 | 3 | 4 | 3 | ... | 5 | 4 | 3 | 4 | 4 | 5 | 5 | 25 | 18.0 | neutral or dissatisfied |
| 1 | 1 | Male | disloyal Customer | 25 | Business travel | Business | 235 | 3 | 2 | 3 | ... | 1 | 1 | 5 | 3 | 1 | 4 | 1 | 1 | 6.0 | neutral or dissatisfied |
| 2 | 2 | Female | Loyal Customer | 26 | Business travel | Business | 1142 | 2 | 2 | 2 | ... | 5 | 4 | 3 | 4 | 4 | 4 | 5 | 0 | 0.0 | satisfied |
| 3 | 3 | Female | Loyal Customer | 25 | Business travel | Business | 562 | 2 | 5 | 5 | ... | 2 | 2 | 5 | 3 | 1 | 4 | 2 | 11 | 9.0 | neutral or dissatisfied |
| 4 | 4 | Male | Loyal Customer | 61 | Business travel | Business | 214 | 3 | 3 | 3 | ... | 3 | 3 | 4 | 4 | 3 | 3 | 3 | 0 | 0.0 | satisfied |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129875 | 129875 | Male | disloyal Customer | 34 | Business travel | Business | 526 | 3 | 3 | 3 | ... | 4 | 3 | 2 | 4 | 4 | 5 | 4 | 0 | 0.0 | neutral or dissatisfied |
| 129876 | 129876 | Male | Loyal Customer | 23 | Business travel | Business | 646 | 4 | 4 | 4 | ... | 4 | 4 | 5 | 5 | 5 | 5 | 4 | 0 | 0.0 | satisfied |
| 129877 | 129877 | Female | Loyal Customer | 17 | Personal Travel | Eco | 828 | 2 | 5 | 1 | ... | 2 | 4 | 3 | 4 | 5 | 4 | 2 | 0 | 0.0 | neutral or dissatisfied |
| 129878 | 129878 | Male | Loyal Customer | 14 | Business travel | Business | 1127 | 3 | 3 | 3 | ... | 4 | 3 | 2 | 5 | 4 | 5 | 4 | 0 | 0.0 | satisfied |
| 129879 | 129879 | Female | Loyal Customer | 42 | Personal Travel | Eco | 264 | 2 | 5 | 2 | ... | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 0 | 0.0 | neutral or dissatisfied |
129880 rows Ć 24 columns
InĀ [3]:
df=df1
InĀ [4]:
df.head() #to get top 06 values
Out[4]:
| Unnamed: 0 | Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Male | Loyal Customer | 13 | Personal Travel | Eco Plus | 460 | 3 | 4 | 3 | ... | 5 | 4 | 3 | 4 | 4 | 5 | 5 | 25 | 18.0 | neutral or dissatisfied |
| 1 | 1 | Male | disloyal Customer | 25 | Business travel | Business | 235 | 3 | 2 | 3 | ... | 1 | 1 | 5 | 3 | 1 | 4 | 1 | 1 | 6.0 | neutral or dissatisfied |
| 2 | 2 | Female | Loyal Customer | 26 | Business travel | Business | 1142 | 2 | 2 | 2 | ... | 5 | 4 | 3 | 4 | 4 | 4 | 5 | 0 | 0.0 | satisfied |
| 3 | 3 | Female | Loyal Customer | 25 | Business travel | Business | 562 | 2 | 5 | 5 | ... | 2 | 2 | 5 | 3 | 1 | 4 | 2 | 11 | 9.0 | neutral or dissatisfied |
| 4 | 4 | Male | Loyal Customer | 61 | Business travel | Business | 214 | 3 | 3 | 3 | ... | 3 | 3 | 4 | 4 | 3 | 3 | 3 | 0 | 0.0 | satisfied |
5 rows Ć 24 columns
InĀ [5]:
df.tail()
Out[5]:
| Unnamed: 0 | Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 129875 | 129875 | Male | disloyal Customer | 34 | Business travel | Business | 526 | 3 | 3 | 3 | ... | 4 | 3 | 2 | 4 | 4 | 5 | 4 | 0 | 0.0 | neutral or dissatisfied |
| 129876 | 129876 | Male | Loyal Customer | 23 | Business travel | Business | 646 | 4 | 4 | 4 | ... | 4 | 4 | 5 | 5 | 5 | 5 | 4 | 0 | 0.0 | satisfied |
| 129877 | 129877 | Female | Loyal Customer | 17 | Personal Travel | Eco | 828 | 2 | 5 | 1 | ... | 2 | 4 | 3 | 4 | 5 | 4 | 2 | 0 | 0.0 | neutral or dissatisfied |
| 129878 | 129878 | Male | Loyal Customer | 14 | Business travel | Business | 1127 | 3 | 3 | 3 | ... | 4 | 3 | 2 | 5 | 4 | 5 | 4 | 0 | 0.0 | satisfied |
| 129879 | 129879 | Female | Loyal Customer | 42 | Personal Travel | Eco | 264 | 2 | 5 | 2 | ... | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 0 | 0.0 | neutral or dissatisfied |
5 rows Ć 24 columns
InĀ [6]:
df = df.drop(columns=['Unnamed: 0'], axis=1)
df
Out[6]:
| Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | gate_location | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | Loyal Customer | 13 | Personal Travel | Eco Plus | 460 | 3 | 4 | 3 | 1 | ... | 5 | 4 | 3 | 4 | 4 | 5 | 5 | 25 | 18.0 | neutral or dissatisfied |
| 1 | Male | disloyal Customer | 25 | Business travel | Business | 235 | 3 | 2 | 3 | 3 | ... | 1 | 1 | 5 | 3 | 1 | 4 | 1 | 1 | 6.0 | neutral or dissatisfied |
| 2 | Female | Loyal Customer | 26 | Business travel | Business | 1142 | 2 | 2 | 2 | 2 | ... | 5 | 4 | 3 | 4 | 4 | 4 | 5 | 0 | 0.0 | satisfied |
| 3 | Female | Loyal Customer | 25 | Business travel | Business | 562 | 2 | 5 | 5 | 5 | ... | 2 | 2 | 5 | 3 | 1 | 4 | 2 | 11 | 9.0 | neutral or dissatisfied |
| 4 | Male | Loyal Customer | 61 | Business travel | Business | 214 | 3 | 3 | 3 | 3 | ... | 3 | 3 | 4 | 4 | 3 | 3 | 3 | 0 | 0.0 | satisfied |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129875 | Male | disloyal Customer | 34 | Business travel | Business | 526 | 3 | 3 | 3 | 1 | ... | 4 | 3 | 2 | 4 | 4 | 5 | 4 | 0 | 0.0 | neutral or dissatisfied |
| 129876 | Male | Loyal Customer | 23 | Business travel | Business | 646 | 4 | 4 | 4 | 4 | ... | 4 | 4 | 5 | 5 | 5 | 5 | 4 | 0 | 0.0 | satisfied |
| 129877 | Female | Loyal Customer | 17 | Personal Travel | Eco | 828 | 2 | 5 | 1 | 5 | ... | 2 | 4 | 3 | 4 | 5 | 4 | 2 | 0 | 0.0 | neutral or dissatisfied |
| 129878 | Male | Loyal Customer | 14 | Business travel | Business | 1127 | 3 | 3 | 3 | 3 | ... | 4 | 3 | 2 | 5 | 4 | 5 | 4 | 0 | 0.0 | satisfied |
| 129879 | Female | Loyal Customer | 42 | Personal Travel | Eco | 264 | 2 | 5 | 2 | 5 | ... | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 0 | 0.0 | neutral or dissatisfied |
129880 rows Ć 23 columns
InĀ [7]:
df.shape
Out[7]:
(129880, 23)
InĀ [8]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 129880 entries, 0 to 129879 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 129880 non-null object 1 customer_type 129880 non-null object 2 age 129880 non-null int64 3 type_of_travel 129880 non-null object 4 customer_class 129880 non-null object 5 flight_distance 129880 non-null int64 6 inflight_wifi_service 129880 non-null int64 7 departure_arrival_time_convenient 129880 non-null int64 8 ease_of_online_booking 129880 non-null int64 9 gate_location 129880 non-null int64 10 food_and_drink 129880 non-null int64 11 online_boarding 129880 non-null int64 12 seat_comfort 129880 non-null int64 13 inflight_entertainment 129880 non-null int64 14 onboard_service 129880 non-null int64 15 leg_room_service 129880 non-null int64 16 baggage_handling 129880 non-null int64 17 checkin_service 129880 non-null int64 18 inflight_service 129880 non-null int64 19 cleanliness 129880 non-null int64 20 departure_delay_in_minutes 129880 non-null int64 21 arrival_delay_in_minutes 129487 non-null float64 22 satisfaction 129880 non-null object dtypes: float64(1), int64(17), object(5) memory usage: 22.8+ MB
DATA CLEANING¶
InĀ [9]:
df.describe(include='all')
Out[9]:
| Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | gate_location | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 129880 | 129880 | 129880.000000 | 129880 | 129880 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | ... | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129880.000000 | 129487.000000 | 129880 |
| unique | 2 | 2 | NaN | 2 | 3 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 |
| top | Female | Loyal Customer | NaN | Business travel | Business | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | neutral or dissatisfied |
| freq | 65899 | 106100 | NaN | 89693 | 62160 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 73452 |
| mean | NaN | NaN | 39.427957 | NaN | NaN | 1190.316392 | 2.728696 | 3.057599 | 2.756876 | 2.976925 | ... | 3.358077 | 3.383023 | 3.350878 | 3.632114 | 3.306267 | 3.642193 | 3.286326 | 14.713713 | 15.091129 | NaN |
| std | NaN | NaN | 15.119360 | NaN | NaN | 997.452477 | 1.329340 | 1.526741 | 1.401740 | 1.278520 | ... | 1.334049 | 1.287099 | 1.316252 | 1.180025 | 1.266185 | 1.176669 | 1.313682 | 38.071126 | 38.465650 | NaN |
| min | NaN | NaN | 7.000000 | NaN | NaN | 31.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
| 25% | NaN | NaN | 27.000000 | NaN | NaN | 414.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | ... | 2.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | NaN |
| 50% | NaN | NaN | 40.000000 | NaN | NaN | 844.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | ... | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.000000 | 4.000000 | 3.000000 | 0.000000 | 0.000000 | NaN |
| 75% | NaN | NaN | 51.000000 | NaN | NaN | 1744.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | ... | 4.000000 | 4.000000 | 4.000000 | 5.000000 | 4.000000 | 5.000000 | 4.000000 | 12.000000 | 13.000000 | NaN |
| max | NaN | NaN | 85.000000 | NaN | NaN | 4983.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | ... | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 1592.000000 | 1584.000000 | NaN |
11 rows Ć 23 columns
InĀ [10]:
df.duplicated().any()
Out[10]:
np.False_
InĀ [11]:
df.isnull().sum()
Out[11]:
Gender 0 customer_type 0 age 0 type_of_travel 0 customer_class 0 flight_distance 0 inflight_wifi_service 0 departure_arrival_time_convenient 0 ease_of_online_booking 0 gate_location 0 food_and_drink 0 online_boarding 0 seat_comfort 0 inflight_entertainment 0 onboard_service 0 leg_room_service 0 baggage_handling 0 checkin_service 0 inflight_service 0 cleanliness 0 departure_delay_in_minutes 0 arrival_delay_in_minutes 393 satisfaction 0 dtype: int64
InĀ [12]:
sns.displot(df['arrival_delay_in_minutes'].dropna(),kde=False,bins=50)
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x1742a90d0d0>
The above histogram shows that the distribution of arrival_delay_in_minutes is skewed therefore median imputation is a good choice.
InĀ [13]:
df.isnull().sum
Out[13]:
<bound method DataFrame.sum of Gender customer_type age type_of_travel customer_class \
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
... ... ... ... ... ...
129875 False False False False False
129876 False False False False False
129877 False False False False False
129878 False False False False False
129879 False False False False False
flight_distance inflight_wifi_service \
0 False False
1 False False
2 False False
3 False False
4 False False
... ... ...
129875 False False
129876 False False
129877 False False
129878 False False
129879 False False
departure_arrival_time_convenient ease_of_online_booking \
0 False False
1 False False
2 False False
3 False False
4 False False
... ... ...
129875 False False
129876 False False
129877 False False
129878 False False
129879 False False
gate_location ... inflight_entertainment onboard_service \
0 False ... False False
1 False ... False False
2 False ... False False
3 False ... False False
4 False ... False False
... ... ... ... ...
129875 False ... False False
129876 False ... False False
129877 False ... False False
129878 False ... False False
129879 False ... False False
leg_room_service baggage_handling checkin_service inflight_service \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
... ... ... ... ...
129875 False False False False
129876 False False False False
129877 False False False False
129878 False False False False
129879 False False False False
cleanliness departure_delay_in_minutes arrival_delay_in_minutes \
0 False False False
1 False False False
2 False False False
3 False False False
4 False False False
... ... ... ...
129875 False False False
129876 False False False
129877 False False False
129878 False False False
129879 False False False
satisfaction
0 False
1 False
2 False
3 False
4 False
... ...
129875 False
129876 False
129877 False
129878 False
129879 False
[129880 rows x 23 columns]>
VISUAL DATA EXPLORATORY¶
InĀ [14]:
# Ensure the column names exist in your DataFrame
print(df.columns)
# selection of columns
numerical = df.drop(columns=["Gender", "customer_type", "type_of_travel", "customer_class", "satisfaction"])
categorical = df[["Gender", "customer_type", "type_of_travel", "customer_class", "satisfaction"]]
Index(['Gender', 'customer_type', 'age', 'type_of_travel', 'customer_class',
'flight_distance', 'inflight_wifi_service',
'departure_arrival_time_convenient', 'ease_of_online_booking',
'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
'inflight_entertainment', 'onboard_service', 'leg_room_service',
'baggage_handling', 'checkin_service', 'inflight_service',
'cleanliness', 'departure_delay_in_minutes', 'arrival_delay_in_minutes',
'satisfaction'],
dtype='object')
InĀ [17]:
sns.countplot(x='customer_type', data=categorical)
Out[17]:
<Axes: xlabel='customer_type', ylabel='count'>
InĀ [18]:
sns.countplot(x='customer_type', data=categorical)
Out[18]:
<Axes: xlabel='customer_type', ylabel='count'>
InĀ [19]:
sns.countplot(x='type_of_travel', data=categorical)
Out[19]:
<Axes: xlabel='type_of_travel', ylabel='count'>
InĀ [20]:
sns.countplot(x='customer_class', data=categorical)
Out[20]:
<Axes: xlabel='customer_class', ylabel='count'>
InĀ [21]:
df['customer_class'].value_counts()
Out[21]:
customer_class Business 62160 Eco 58309 Eco Plus 9411 Name: count, dtype: int64
InĀ [22]:
sns.countplot(x='satisfaction', data=categorical)
Out[22]:
<Axes: xlabel='satisfaction', ylabel='count'>
InĀ [23]:
for x in numerical:
sns.distplot(numerical, kde=True)
plt.show()
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3292991569.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(numerical, kde=True)
InĀ [24]:
sns.countplot(x='inflight_wifi_service',data=df)
Out[24]:
<Axes: xlabel='inflight_wifi_service', ylabel='count'>
InĀ [25]:
df['inflight_wifi_service'].value_counts()
Out[25]:
inflight_wifi_service 2 32320 3 32185 4 24775 1 22328 5 14356 0 3916 Name: count, dtype: int64
InĀ [26]:
sns.countplot(x='departure_arrival_time_convenient',data=df)
Out[26]:
<Axes: xlabel='departure_arrival_time_convenient', ylabel='count'>
InĀ [27]:
df['departure_arrival_time_convenient'].value_counts()
Out[27]:
departure_arrival_time_convenient 4 31880 5 27998 3 22378 2 21534 1 19409 0 6681 Name: count, dtype: int64
InĀ [28]:
sns.countplot(x='ease_of_online_booking',data=df)
Out[28]:
<Axes: xlabel='ease_of_online_booking', ylabel='count'>
InĀ [29]:
df['ease_of_online_booking'].value_counts()
Out[29]:
ease_of_online_booking 3 30393 2 30051 4 24444 1 21886 5 17424 0 5682 Name: count, dtype: int64
InĀ [30]:
sns.countplot(x='gate_location',data=df)
Out[30]:
<Axes: xlabel='gate_location', ylabel='count'>
InĀ [31]:
df['gate_location'].value_counts()
Out[31]:
gate_location 3 35717 4 30466 2 24296 1 21991 5 17409 0 1 Name: count, dtype: int64
InĀ [32]:
sns.countplot(x='food_and_drink',data=df)
Out[32]:
<Axes: xlabel='food_and_drink', ylabel='count'>
InĀ [33]:
df['food_and_drink'].value_counts()
Out[33]:
food_and_drink 4 30563 5 27957 3 27794 2 27383 1 16051 0 132 Name: count, dtype: int64
InĀ [34]:
sns.countplot(x='online_boarding',data=df)
Out[34]:
<Axes: xlabel='online_boarding', ylabel='count'>
InĀ [35]:
df['online_boarding'].value_counts()
Out[35]:
online_boarding 4 38468 3 27117 5 26020 2 21934 1 13261 0 3080 Name: count, dtype: int64
InĀ [36]:
sns.countplot(x='seat_comfort',data=df)
Out[36]:
<Axes: xlabel='seat_comfort', ylabel='count'>
InĀ [37]:
df['seat_comfort'].value_counts()
Out[37]:
seat_comfort 4 39756 5 33158 3 23328 2 18529 1 15108 0 1 Name: count, dtype: int64
InĀ [38]:
sns.countplot(x='inflight_entertainment',data=df)
Out[38]:
<Axes: xlabel='inflight_entertainment', ylabel='count'>
InĀ [39]:
df['inflight_entertainment'].value_counts()
Out[39]:
inflight_entertainment 4 36791 5 31544 3 23884 2 21968 1 15675 0 18 Name: count, dtype: int64
InĀ [40]:
sns.countplot(x='onboard_service',data=df)
Out[40]:
<Axes: xlabel='onboard_service', ylabel='count'>
InĀ [41]:
df['onboard_service'].value_counts()
Out[41]:
onboard_service 4 38703 5 29492 3 28542 2 18351 1 14787 0 5 Name: count, dtype: int64
InĀ [42]:
sns.countplot(x='leg_room_service',data=df)
Out[42]:
<Axes: xlabel='leg_room_service', ylabel='count'>
InĀ [43]:
df['leg_room_service'].value_counts()
Out[43]:
leg_room_service 4 35886 5 30905 3 25056 2 24540 1 12895 0 598 Name: count, dtype: int64
InĀ [44]:
sns.countplot(x='baggage_handling',data=df)
Out[44]:
<Axes: xlabel='baggage_handling', ylabel='count'>
InĀ [45]:
df['baggage_handling'].value_counts()
Out[45]:
baggage_handling 4 46761 5 33878 3 25851 2 14362 1 9028 Name: count, dtype: int64
InĀ [46]:
sns.countplot(x='checkin_service',data=df)
Out[46]:
<Axes: xlabel='checkin_service', ylabel='count'>
InĀ [47]:
df['checkin_service'].value_counts()
Out[47]:
checkin_service 4 36333 3 35453 5 25883 1 16108 2 16102 0 1 Name: count, dtype: int64
InĀ [48]:
sns.countplot(x='inflight_service',data=df)
Out[48]:
<Axes: xlabel='inflight_service', ylabel='count'>
InĀ [49]:
df['inflight_service'].value_counts()
Out[49]:
inflight_service 4 47323 5 34066 3 25316 2 14308 1 8862 0 5 Name: count, dtype: int64
InĀ [50]:
sns.countplot(x='cleanliness',data=df)
Out[50]:
<Axes: xlabel='cleanliness', ylabel='count'>
InĀ [51]:
df['cleanliness'].value_counts()
Out[51]:
cleanliness 4 33969 3 30639 5 28416 2 20113 1 16729 0 14 Name: count, dtype: int64
InĀ [52]:
sns.countplot(x='Gender',data=df,hue='satisfaction')
Out[52]:
<Axes: xlabel='Gender', ylabel='count'>
InĀ [53]:
sns.countplot(x='customer_type',data=df,hue='satisfaction')
Out[53]:
<Axes: xlabel='customer_type', ylabel='count'>
InĀ [54]:
sns.countplot(x='type_of_travel',data=df,hue='satisfaction')
Out[54]:
<Axes: xlabel='type_of_travel', ylabel='count'>
InĀ [55]:
sns.countplot(x='customer_class',data=df,hue='type_of_travel')
Out[55]:
<Axes: xlabel='customer_class', ylabel='count'>
InĀ [56]:
sns.countplot(x='customer_class',data=df,hue='satisfaction')
Out[56]:
<Axes: xlabel='customer_class', ylabel='count'>
InĀ [57]:
sns.countplot(x='satisfaction',data=df,hue='satisfaction')
Out[57]:
<Axes: xlabel='satisfaction', ylabel='count'>
InĀ [58]:
df['satisfaction'].value_counts()
Out[58]:
satisfaction neutral or dissatisfied 73452 satisfied 56428 Name: count, dtype: int64
InĀ [59]:
sns.countplot(x='inflight_wifi_service',data=df,hue='satisfaction')
Out[59]:
<Axes: xlabel='inflight_wifi_service', ylabel='count'>
InĀ [60]:
sns.countplot(x='departure_arrival_time_convenient',data=df,hue='satisfaction')
Out[60]:
<Axes: xlabel='departure_arrival_time_convenient', ylabel='count'>
InĀ [61]:
sns.countplot(x='ease_of_online_booking',data=df,hue='satisfaction')
Out[61]:
<Axes: xlabel='ease_of_online_booking', ylabel='count'>
InĀ [62]:
sns.countplot(x='gate_location',data=df,hue='satisfaction')
Out[62]:
<Axes: xlabel='gate_location', ylabel='count'>
InĀ [63]:
sns.countplot(x='food_and_drink',data=df,hue='satisfaction')
Out[63]:
<Axes: xlabel='food_and_drink', ylabel='count'>
InĀ [64]:
sns.countplot(x='online_boarding',data=df,hue='satisfaction')
Out[64]:
<Axes: xlabel='online_boarding', ylabel='count'>
InĀ [65]:
sns.countplot(x='seat_comfort',data=df,hue='satisfaction')
Out[65]:
<Axes: xlabel='seat_comfort', ylabel='count'>
InĀ [66]:
sns.countplot(x='inflight_entertainment',data=df,hue='satisfaction')
Out[66]:
<Axes: xlabel='inflight_entertainment', ylabel='count'>
InĀ [67]:
sns.countplot(x='onboard_service',data=df,hue='satisfaction')
Out[67]:
<Axes: xlabel='onboard_service', ylabel='count'>
InĀ [68]:
sns.countplot(x='leg_room_service',data=df,hue='satisfaction')
Out[68]:
<Axes: xlabel='leg_room_service', ylabel='count'>
InĀ [69]:
sns.countplot(x='baggage_handling',data=df,hue='satisfaction')
Out[69]:
<Axes: xlabel='baggage_handling', ylabel='count'>
InĀ [70]:
sns.countplot(x='checkin_service',data=df,hue='satisfaction')
Out[70]:
<Axes: xlabel='checkin_service', ylabel='count'>
InĀ [71]:
sns.countplot(x='inflight_service',data=df,hue='satisfaction')
Out[71]:
<Axes: xlabel='inflight_service', ylabel='count'>
InĀ [72]:
sns.countplot(x='cleanliness',data=df,hue='satisfaction')
Out[72]:
<Axes: xlabel='cleanliness', ylabel='count'>
InĀ [73]:
plt.subplots(figsize = (10,7))
sns.histplot(x='flight_distance',hue="satisfaction",data=df,kde=True)
plt.title("Flight Distance VS Satisfaction")
Out[73]:
Text(0.5, 1.0, 'Flight Distance VS Satisfaction')
InĀ [74]:
sns.pairplot(df)
Out[74]:
<seaborn.axisgrid.PairGrid at 0x174565861b0>
InĀ [75]:
sns.boxplot(x='satisfaction',y='inflight_wifi_service',data=df, hue='satisfaction')
Out[75]:
<Axes: xlabel='satisfaction', ylabel='inflight_wifi_service'>
InĀ [76]:
sns.boxplot(x='satisfaction',y='age',data=df, hue='satisfaction')
Out[76]:
<Axes: xlabel='satisfaction', ylabel='age'>
InĀ [77]:
sns.boxplot(x='satisfaction',y='departure_arrival_time_convenient',data=df, hue='satisfaction')
Out[77]:
<Axes: xlabel='satisfaction', ylabel='departure_arrival_time_convenient'>
InĀ [78]:
sns.boxplot(x='satisfaction',y='ease_of_online_booking',data=df, hue='satisfaction')
Out[78]:
<Axes: xlabel='satisfaction', ylabel='ease_of_online_booking'>
InĀ [79]:
sns.boxplot(x='satisfaction',y='gate_location',data=df, hue='satisfaction')
Out[79]:
<Axes: xlabel='satisfaction', ylabel='gate_location'>
InĀ [80]:
sns.boxplot(x='satisfaction',y='food_and_drink',data=df, hue='satisfaction')
Out[80]:
<Axes: xlabel='satisfaction', ylabel='food_and_drink'>
InĀ [81]:
sns.boxplot(x='satisfaction',y='online_boarding',data=df, hue='satisfaction')
Out[81]:
<Axes: xlabel='satisfaction', ylabel='online_boarding'>
InĀ [82]:
sns.boxplot(x='satisfaction',y='seat_comfort',data=df, hue='satisfaction')
Out[82]:
<Axes: xlabel='satisfaction', ylabel='seat_comfort'>
InĀ [83]:
sns.boxplot(x='satisfaction',y='inflight_entertainment',data=df, hue='satisfaction')
Out[83]:
<Axes: xlabel='satisfaction', ylabel='inflight_entertainment'>
InĀ [84]:
sns.boxplot(x='satisfaction',y='onboard_service',data=df, hue='satisfaction')
Out[84]:
<Axes: xlabel='satisfaction', ylabel='onboard_service'>
InĀ [85]:
sns.boxplot(x='satisfaction',y='leg_room_service',data=df, hue='satisfaction')
Out[85]:
<Axes: xlabel='satisfaction', ylabel='leg_room_service'>
InĀ [86]:
sns.boxplot(x='satisfaction',y='baggage_handling',data=df, hue='satisfaction')
Out[86]:
<Axes: xlabel='satisfaction', ylabel='baggage_handling'>
InĀ [87]:
sns.boxplot(x='satisfaction',y='checkin_service',data=df, hue='satisfaction')
Out[87]:
<Axes: xlabel='satisfaction', ylabel='checkin_service'>
InĀ [88]:
sns.boxplot(x='satisfaction',y='inflight_service',data=df, hue='satisfaction')
Out[88]:
<Axes: xlabel='satisfaction', ylabel='inflight_service'>
InĀ [89]:
sns.boxplot(x='satisfaction',y='cleanliness',data=df, hue='satisfaction')
Out[89]:
<Axes: xlabel='satisfaction', ylabel='cleanliness'>
InĀ [Ā ]:
import seaborn as sns
sns.boxplot(x='departure_delay_in_minutes', figsize =(10,6), data=df)
plt.show()
InĀ [3]:
sns.boxplot(x='arrival_delay_in_minutes', data=df)
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 1 ----> 1 sns.boxplot(x='arrival_delay_in_minutes', data=df) 2 plt.show() NameError: name 'df' is not defined
InĀ [92]:
def remove_outliers(column_name):
Q1 = df[column_name].quantile(0.25)
Q3 = df[column_name].quantile(0.75)
IQR = Q3 - Q1
Upper_boundary = Q3 + (1.5 * IQR)
Lower_boundary = Q1 - (1.5 * IQR)
# replace number > Upper_boundary and number < Lower_boundary with nan value
df.loc[(df[column_name] > Upper_boundary) | (df[column_name] < Lower_boundary), column_name] = np.nan
# replace nan value with mean
df[column_name].fillna(df[column_name].mode(), inplace = True)
InĀ [93]:
remove_outliers("inflight_service")
remove_outliers("baggage_handling")
remove_outliers("inflight_entertainment")
remove_outliers("seat_comfort")
remove_outliers("online_boarding")
remove_outliers("checkin_service")
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\3950046972.py:16: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[column_name].fillna(df[column_name].mode(), inplace = True)
InĀ [94]:
# Ensure columns exist and then encode them
columns_to_encode = ['Gender', 'customer_type', 'type_of_travel', 'customer_class', 'satisfaction']
for column in columns_to_encode:
if column in df.columns:
df[column] = df[column].astype('category').cat.codes
else:
print(f"Column '{column}' not found in the DataFrame")
# Verify the changes
print(df.head())
Gender customer_type age type_of_travel customer_class \ 0 1 0 13 1 2 1 1 1 25 0 0 2 0 0 26 0 0 3 0 0 25 0 0 4 1 0 61 0 0 flight_distance inflight_wifi_service departure_arrival_time_convenient \ 0 460 3 4 1 235 3 2 2 1142 2 2 3 562 2 5 4 214 3 3 ease_of_online_booking gate_location ... inflight_entertainment \ 0 3 1 ... 5.0 1 3 3 ... 1.0 2 2 2 ... 5.0 3 5 5 ... 2.0 4 3 3 ... 3.0 onboard_service leg_room_service baggage_handling checkin_service \ 0 4 3 4.0 4.0 1 1 5 3.0 NaN 2 4 3 4.0 4.0 3 2 5 3.0 NaN 4 3 4 4.0 3.0 inflight_service cleanliness departure_delay_in_minutes \ 0 5.0 5 25 1 4.0 1 1 2 4.0 5 0 3 4.0 2 11 4 3.0 3 0 arrival_delay_in_minutes satisfaction 0 18.0 0 1 6.0 0 2 0.0 1 3 9.0 0 4 0.0 1 [5 rows x 23 columns]
InĀ [95]:
corr = df.corr()
InĀ [96]:
corr
Out[96]:
| Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | gate_location | ... | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | arrival_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | 1.000000 | -0.030958 | 0.008996 | 0.009503 | -0.011574 | 0.003616 | 0.005901 | 0.008995 | 0.005893 | -0.000863 | ... | 0.003843 | 0.006447 | 0.031047 | 0.036356 | 0.010361 | 0.038504 | 0.002867 | 0.003491 | 0.001309 | 0.011236 |
| customer_type | -0.030958 | 1.000000 | -0.284172 | -0.308236 | 0.042994 | -0.226021 | -0.005757 | -0.206873 | -0.018059 | 0.004465 | ... | -0.106001 | -0.054172 | -0.046841 | 0.024874 | -0.024412 | 0.023292 | -0.081302 | 0.003859 | 0.004730 | -0.186017 |
| age | 0.008996 | -0.284172 | 1.000000 | -0.044808 | -0.116921 | 0.099459 | 0.016116 | 0.036960 | 0.022565 | -0.000398 | ... | 0.074947 | 0.057078 | 0.039119 | -0.047991 | 0.023708 | -0.051347 | 0.052565 | -0.009041 | -0.011248 | 0.134091 |
| type_of_travel | 0.009503 | -0.308236 | -0.044808 | 1.000000 | 0.486718 | -0.266792 | -0.105865 | 0.257102 | -0.134078 | -0.029869 | ... | -0.152936 | -0.059794 | -0.139612 | -0.033012 | 0.012815 | -0.023538 | -0.084615 | -0.005913 | -0.005830 | -0.449861 |
| customer_class | -0.011574 | 0.042994 | -0.116921 | 0.486718 | 1.000000 | -0.426925 | -0.024962 | 0.087152 | -0.095138 | -0.005701 | ... | -0.183162 | -0.210649 | -0.198825 | -0.166588 | -0.118912 | -0.159285 | -0.129623 | 0.009530 | 0.014162 | -0.448193 |
| flight_distance | 0.003616 | -0.226021 | 0.099459 | -0.266792 | -0.426925 | 1.000000 | 0.006701 | -0.018914 | 0.065165 | 0.005520 | ... | 0.130507 | 0.111194 | 0.134533 | 0.064855 | 0.055870 | 0.059316 | 0.095648 | 0.002402 | -0.001935 | 0.298085 |
| inflight_wifi_service | 0.005901 | -0.005757 | 0.016116 | -0.105865 | -0.024962 | 0.006701 | 1.000000 | 0.344915 | 0.714807 | 0.338573 | ... | 0.207802 | 0.119928 | 0.160317 | 0.120376 | 0.035005 | 0.110029 | 0.131300 | -0.015946 | -0.017749 | 0.283460 |
| departure_arrival_time_convenient | 0.008995 | -0.206873 | 0.036960 | 0.257102 | 0.087152 | -0.018914 | 0.344915 | 1.000000 | 0.437620 | 0.447510 | ... | -0.008380 | 0.067297 | 0.010617 | 0.070833 | 0.071985 | 0.072195 | 0.009862 | 0.000778 | -0.000942 | -0.054270 |
| ease_of_online_booking | 0.005893 | -0.018059 | 0.022565 | -0.134078 | -0.095138 | 0.065165 | 0.714807 | 0.437620 | 1.000000 | 0.460041 | ... | 0.046564 | 0.039064 | 0.109450 | 0.039148 | 0.002264 | 0.035373 | 0.015125 | -0.005318 | -0.007033 | 0.168877 |
| gate_location | -0.000863 | 0.004465 | -0.000398 | -0.029869 | -0.005701 | 0.005520 | 0.338573 | 0.447510 | 0.460041 | 1.000000 | ... | 0.002741 | -0.029019 | -0.005181 | 0.000972 | -0.034097 | 0.000310 | -0.005918 | 0.005973 | 0.005658 | -0.002793 |
| food_and_drink | 0.001730 | -0.056997 | 0.023194 | -0.068986 | -0.080773 | 0.057066 | 0.132214 | 0.000687 | 0.030514 | -0.002872 | ... | 0.623461 | 0.057404 | 0.033173 | 0.035321 | 0.070585 | 0.035210 | 0.658054 | -0.029164 | -0.031715 | 0.211340 |
| online_boarding | -0.045022 | -0.189083 | 0.207572 | -0.224020 | -0.297645 | 0.214825 | 0.457445 | 0.072287 | 0.404866 | 0.002756 | ... | 0.283922 | 0.154242 | 0.123225 | 0.083541 | 0.163077 | 0.074058 | 0.329377 | -0.019404 | -0.022730 | 0.501749 |
| seat_comfort | -0.030756 | -0.156239 | 0.159136 | -0.127717 | -0.212334 | 0.157662 | 0.121513 | 0.008666 | 0.028561 | 0.002788 | ... | 0.611837 | 0.130545 | 0.104272 | 0.074620 | 0.153768 | 0.068842 | 0.679613 | -0.027999 | -0.030521 | 0.348829 |
| inflight_entertainment | 0.003843 | -0.106001 | 0.074947 | -0.152936 | -0.183162 | 0.130507 | 0.207802 | -0.008380 | 0.046564 | 0.002741 | ... | 1.000000 | 0.418574 | 0.300397 | 0.379123 | 0.101803 | 0.406094 | 0.692511 | -0.027012 | -0.030230 | 0.398234 |
| onboard_service | 0.006447 | -0.054172 | 0.057078 | -0.059794 | -0.210649 | 0.111194 | 0.119928 | 0.067297 | 0.039064 | -0.029019 | ... | 0.418574 | 1.000000 | 0.357721 | 0.520296 | 0.201324 | 0.551569 | 0.122084 | -0.030486 | -0.034789 | 0.322205 |
| leg_room_service | 0.031047 | -0.046841 | 0.039119 | -0.139612 | -0.198825 | 0.134533 | 0.160317 | 0.010617 | 0.109450 | -0.005181 | ... | 0.300397 | 0.357721 | 1.000000 | 0.371455 | 0.126902 | 0.369569 | 0.096695 | 0.014574 | 0.011346 | 0.312424 |
| baggage_handling | 0.036356 | 0.024874 | -0.047991 | -0.033012 | -0.166588 | 0.064855 | 0.120376 | 0.070833 | 0.039148 | 0.000972 | ... | 0.379123 | 0.520296 | 0.371455 | 1.000000 | 0.188843 | 0.629237 | 0.097071 | -0.004105 | -0.007935 | 0.248680 |
| checkin_service | 0.010361 | -0.024412 | 0.023708 | 0.012815 | -0.118912 | 0.055870 | 0.035005 | 0.071985 | 0.002264 | -0.034097 | ... | 0.101803 | 0.201324 | 0.126902 | 0.188843 | 1.000000 | 0.190977 | 0.146582 | -0.013459 | -0.016034 | 0.198172 |
| inflight_service | 0.038504 | 0.023292 | -0.051347 | -0.023538 | -0.159285 | 0.059316 | 0.110029 | 0.072195 | 0.035373 | 0.000310 | ... | 0.406094 | 0.551569 | 0.369569 | 0.629237 | 0.190977 | 1.000000 | 0.090356 | -0.054432 | -0.059853 | 0.244918 |
| cleanliness | 0.002867 | -0.081302 | 0.052565 | -0.084615 | -0.129623 | 0.095648 | 0.131300 | 0.009862 | 0.015125 | -0.005918 | ... | 0.692511 | 0.122084 | 0.096695 | 0.097071 | 0.146582 | 0.090356 | 1.000000 | -0.014543 | -0.016546 | 0.307035 |
| departure_delay_in_minutes | 0.003491 | 0.003859 | -0.009041 | -0.005913 | 0.009530 | 0.002402 | -0.015946 | 0.000778 | -0.005318 | 0.005973 | ... | -0.027012 | -0.030486 | 0.014574 | -0.004105 | -0.013459 | -0.054432 | -0.014543 | 1.000000 | 0.965291 | -0.050740 |
| arrival_delay_in_minutes | 0.001309 | 0.004730 | -0.011248 | -0.005830 | 0.014162 | -0.001935 | -0.017749 | -0.000942 | -0.007033 | 0.005658 | ... | -0.030230 | -0.034789 | 0.011346 | -0.007935 | -0.016034 | -0.059853 | -0.016546 | 0.965291 | 1.000000 | -0.058275 |
| satisfaction | 0.011236 | -0.186017 | 0.134091 | -0.449861 | -0.448193 | 0.298085 | 0.283460 | -0.054270 | 0.168877 | -0.002793 | ... | 0.398234 | 0.322205 | 0.312424 | 0.248680 | 0.198172 | 0.244918 | 0.307035 | -0.050740 | -0.058275 | 1.000000 |
23 rows Ć 23 columns
InĀ [97]:
plt.figure(figsize=(30,5))
sns.heatmap(df.select_dtypes(exclude = object).corr(), fmt = ".2f", linewidths = 0.2, annot=True,cmap='coolwarm',center=0)
plt.show()
MODELLING¶
InĀ [98]:
#importing required models
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor
InĀ [99]:
#checking multicollinearity
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Separate features and target variable
x = df.drop(columns=['satisfaction'])
y = df['satisfaction'].values
# Check for missing or infinite values in the features
if x.isnull().values.any() or np.isinf(x).values.any():
# Option 2: Impute missing values (example: fill with mean)
x = x.fillna(x.mean()).replace([np.inf, -np.inf], np.nan).fillna(x.mean())
# Add a constant to the independent variables
X = sm.add_constant(x)
# Calculate VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)
feature VIF 0 const 60.406934 1 Gender 1.011189 2 customer_type 1.514992 3 age 1.161430 4 type_of_travel 1.831717 5 customer_class 1.680157 6 flight_distance 1.344833 7 inflight_wifi_service 2.442128 8 departure_arrival_time_convenient 1.663603 9 ease_of_online_booking 2.694192 10 gate_location 1.507225 11 food_and_drink 2.173924 12 online_boarding 1.983909 13 seat_comfort 2.382917 14 inflight_entertainment 3.814593 15 onboard_service 1.759608 16 leg_room_service 1.315638 17 baggage_handling 1.900141 18 checkin_service 1.112991 19 inflight_service 2.064726 20 cleanliness 2.846353 21 departure_delay_in_minutes 12.867672 22 arrival_delay_in_minutes 12.878252
InĀ [100]:
df= df.drop(columns=['arrival_delay_in_minutes'],axis=1)
df
Out[100]:
| Gender | customer_type | age | type_of_travel | customer_class | flight_distance | inflight_wifi_service | departure_arrival_time_convenient | ease_of_online_booking | gate_location | ... | seat_comfort | inflight_entertainment | onboard_service | leg_room_service | baggage_handling | checkin_service | inflight_service | cleanliness | departure_delay_in_minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 13 | 1 | 2 | 460 | 3 | 4 | 3 | 1 | ... | 5.0 | 5.0 | 4 | 3 | 4.0 | 4.0 | 5.0 | 5 | 25 | 0 |
| 1 | 1 | 1 | 25 | 0 | 0 | 235 | 3 | 2 | 3 | 3 | ... | 1.0 | 1.0 | 1 | 5 | 3.0 | NaN | 4.0 | 1 | 1 | 0 |
| 2 | 0 | 0 | 26 | 0 | 0 | 1142 | 2 | 2 | 2 | 2 | ... | 5.0 | 5.0 | 4 | 3 | 4.0 | 4.0 | 4.0 | 5 | 0 | 1 |
| 3 | 0 | 0 | 25 | 0 | 0 | 562 | 2 | 5 | 5 | 5 | ... | 2.0 | 2.0 | 2 | 5 | 3.0 | NaN | 4.0 | 2 | 11 | 0 |
| 4 | 1 | 0 | 61 | 0 | 0 | 214 | 3 | 3 | 3 | 3 | ... | 5.0 | 3.0 | 3 | 4 | 4.0 | 3.0 | 3.0 | 3 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129875 | 1 | 1 | 34 | 0 | 0 | 526 | 3 | 3 | 3 | 1 | ... | 4.0 | 4.0 | 3 | 2 | 4.0 | 4.0 | 5.0 | 4 | 0 | 0 |
| 129876 | 1 | 0 | 23 | 0 | 0 | 646 | 4 | 4 | 4 | 4 | ... | 4.0 | 4.0 | 4 | 5 | 5.0 | 5.0 | 5.0 | 4 | 0 | 1 |
| 129877 | 0 | 0 | 17 | 1 | 1 | 828 | 2 | 5 | 1 | 5 | ... | 2.0 | 2.0 | 4 | 3 | 4.0 | 5.0 | 4.0 | 2 | 0 | 0 |
| 129878 | 1 | 0 | 14 | 0 | 0 | 1127 | 3 | 3 | 3 | 3 | ... | 4.0 | 4.0 | 3 | 2 | 5.0 | 4.0 | 5.0 | 4 | 0 | 1 |
| 129879 | 0 | 0 | 42 | 1 | 1 | 264 | 2 | 5 | 2 | 5 | ... | 2.0 | 1.0 | 1 | 2 | 1.0 | NaN | 1.0 | 1 | 0 | 0 |
129880 rows Ć 22 columns
InĀ [101]:
df.shape
Out[101]:
(129880, 22)
InĀ [102]:
#Chceking VIF again
# Separate features and target variable
x = df.drop(columns=['satisfaction'])
y = df['satisfaction'].values
# Check for missing or infinite values in the features
if x.isnull().values.any() or np.isinf(x).values.any():
# Option 1: Drop rows with missing or infinite values
#x = x.dropna().replace([np.inf, -np.inf], np.nan).dropna()
# Option 2: Impute missing values (example: fill with mean)
x = x.fillna(x.mean()).replace([np.inf, -np.inf], np.nan).fillna(x.mean())
# Add a constant to the independent variables
X = sm.add_constant(x)
# Calculate VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)
feature VIF 0 const 60.346700 1 Gender 1.011144 2 customer_type 1.514927 3 age 1.161353 4 type_of_travel 1.831506 5 customer_class 1.679981 6 flight_distance 1.344703 7 inflight_wifi_service 2.442117 8 departure_arrival_time_convenient 1.663601 9 ease_of_online_booking 2.694190 10 gate_location 1.507222 11 food_and_drink 2.173681 12 online_boarding 1.983884 13 seat_comfort 2.382893 14 inflight_entertainment 3.814509 15 onboard_service 1.759570 16 leg_room_service 1.315632 17 baggage_handling 1.900097 18 checkin_service 1.112987 19 inflight_service 2.063934 20 cleanliness 2.846334 21 departure_delay_in_minutes 1.007577
InĀ [103]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Feature Scaling¶
InĀ [104]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
ML MODELS BEFORE RESAMPLING¶
InĀ [105]:
# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
Out[105]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
InĀ [106]:
# Make predictions
y_pred = model.predict(X_test_scaled)
InĀ [107]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[13200 1422]
[ 1864 9490]]
Classification Report:
precision recall f1-score support
0 0.88 0.90 0.89 14622
1 0.87 0.84 0.85 11354
accuracy 0.87 25976
macro avg 0.87 0.87 0.87 25976
weighted avg 0.87 0.87 0.87 25976
InĀ [108]:
from sklearn.neighbors import KNeighborsClassifier
InĀ [109]:
knn= KNeighborsClassifier()
InĀ [110]:
knn.fit(X_train_scaled,y_train)
Out[110]:
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
InĀ [111]:
y_pred_knn=knn.predict(X_test_scaled)
InĀ [112]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_knn)
Out[112]:
0.9291268863566369
InĀ [113]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_knn))
precision recall f1-score support
0 0.91 0.97 0.94 14622
1 0.95 0.88 0.92 11354
accuracy 0.93 25976
macro avg 0.93 0.92 0.93 25976
weighted avg 0.93 0.93 0.93 25976
InĀ [114]:
from sklearn.tree import DecisionTreeClassifier
InĀ [115]:
dt= DecisionTreeClassifier()
dt.fit(X_train_scaled,y_train)
Out[115]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
InĀ [116]:
y_pred_dt=dt.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred_dt))
0.9455651370495842
InĀ [117]:
print(classification_report(y_test,y_pred_dt))
precision recall f1-score support
0 0.95 0.95 0.95 14622
1 0.94 0.94 0.94 11354
accuracy 0.95 25976
macro avg 0.94 0.94 0.94 25976
weighted avg 0.95 0.95 0.95 25976
InĀ [118]:
from sklearn.ensemble import RandomForestClassifier
ran_forest=RandomForestClassifier()
ran_forest.fit(X_train_scaled,y_train)
Out[118]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
InĀ [119]:
y_pred_forest=ran_forest.predict(X_test_scaled)
print(accuracy_score(y_test,y_pred_forest))
0.962157376039421
InĀ [120]:
print(classification_report(y_test,y_pred_forest))
precision recall f1-score support
0 0.95 0.98 0.97 14622
1 0.98 0.94 0.96 11354
accuracy 0.96 25976
macro avg 0.96 0.96 0.96 25976
weighted avg 0.96 0.96 0.96 25976
InĀ [121]:
from sklearn.svm import SVC
svc=SVC()
svc.fit(X_train_scaled,y_train)
Out[121]:
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
InĀ [122]:
y_pred_svc=svc.predict(X_test_scaled)
print(accuracy_score(y_test,y_pred_svc))
0.9541499846011703
InĀ [123]:
print(classification_report(y_test,y_pred_svc))
precision recall f1-score support
0 0.95 0.97 0.96 14622
1 0.96 0.93 0.95 11354
accuracy 0.95 25976
macro avg 0.95 0.95 0.95 25976
weighted avg 0.95 0.95 0.95 25976
ML MODELS AFTER RESAMPLING¶
InĀ [124]:
pip install imblearn
Requirement already satisfied: imblearn in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (0.0)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: imbalanced-learn in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imblearn) (0.12.3) Requirement already satisfied: numpy>=1.17.3 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn->imblearn) (2.0.0) Requirement already satisfied: scipy>=1.5.0 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn->imblearn) (1.14.0) Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn->imblearn) (1.5.1) Requirement already satisfied: joblib>=1.1.1 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn->imblearn) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\ramasish chatterjee\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn->imblearn) (3.5.0)
InĀ [125]:
from imblearn.over_sampling import SMOTE
InĀ [126]:
sm=SMOTE(random_state=42)
x_res,y_res=sm.fit_resample(X_train_scaled,y_train)
InĀ [127]:
knn.fit(x_res,y_res)
y_pred_knn_res=knn.predict(X_test_scaled)
accuracy_score(y_test, y_pred_knn_res)
print(classification_report(y_test,y_pred_knn_res))
precision recall f1-score support
0 0.92 0.95 0.94 14622
1 0.94 0.90 0.92 11354
accuracy 0.93 25976
macro avg 0.93 0.93 0.93 25976
weighted avg 0.93 0.93 0.93 25976
InĀ [128]:
dt.fit(x_res,y_res)
y_pred_dt_res=dt.predict(X_test_scaled)
accuracy_score(y_test, y_pred_dt_res)
print(classification_report(y_test,y_pred_dt_res))
precision recall f1-score support
0 0.95 0.95 0.95 14622
1 0.94 0.94 0.94 11354
accuracy 0.94 25976
macro avg 0.94 0.94 0.94 25976
weighted avg 0.94 0.94 0.94 25976
InĀ [129]:
ran_forest.fit(x_res,y_res)
y_pred_ran_forest_res=ran_forest.predict(X_test_scaled)
accuracy_score(y_test, y_pred_ran_forest_res)
print(classification_report(y_test,y_pred_ran_forest_res))
precision recall f1-score support
0 0.95 0.98 0.97 14622
1 0.97 0.94 0.96 11354
accuracy 0.96 25976
macro avg 0.96 0.96 0.96 25976
weighted avg 0.96 0.96 0.96 25976
InĀ [130]:
svc.fit(x_res,y_res)
y_pred_svc_res=svc.predict(X_test_scaled)
accuracy_score(y_test, y_pred_svc_res)
print(classification_report(y_test,y_pred_svc_res))
precision recall f1-score support
0 0.95 0.96 0.96 14622
1 0.95 0.94 0.95 11354
accuracy 0.95 25976
macro avg 0.95 0.95 0.95 25976
weighted avg 0.95 0.95 0.95 25976
HYPERPARAMETER TUNING¶
InĀ [131]:
from sklearn.model_selection import GridSearchCV
KNN¶
InĀ [132]:
knn_params={"n_neighbors":[1,10,30,50],"weights":["uniform","distance"]}
knn_grid=GridSearchCV(knn,knn_params,cv=5)
knn_grid.fit(x_res,y_res)
C:\Users\Ramasish Chatterjee\AppData\Local\Programs\Python\Python312\Lib\site-packages\numpy\ma\core.py:2846: RuntimeWarning: invalid value encountered in cast _data = np.array(data, dtype=dtype, copy=copy,
Out[132]:
GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
param_grid={'n_neighbors': [1, 10, 30, 50],
'weights': ['uniform', 'distance']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
param_grid={'n_neighbors': [1, 10, 30, 50],
'weights': ['uniform', 'distance']})KNeighborsClassifier(n_neighbors=10, weights='distance')
KNeighborsClassifier(n_neighbors=10, weights='distance')
InĀ [133]:
knn_result=pd.DataFrame(knn_grid.cv_results_)
best_parameters_knn=knn_grid.best_params_
print(best_parameters_knn,knn_grid.best_score_)
{'n_neighbors': 10, 'weights': 'distance'} 0.9416624171341151
Decision Tree¶
InĀ [134]:
dt_params={'criterion':['gini','entropy'],'max_depth':[1,3,5,7,10,None]}
dt_grid=GridSearchCV(dt,dt_params,cv=5)
dt_grid.fit(x_res,y_res)
Out[134]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 3, 5, 7, 10, None]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 3, 5, 7, 10, None]})DecisionTreeClassifier(criterion='entropy')
DecisionTreeClassifier(criterion='entropy')
InĀ [135]:
dt_result=pd.DataFrame(dt_grid.cv_results_)
best_parameters_dt=dt_grid.best_params_
print(best_parameters_dt,dt_grid.best_score_)
{'criterion': 'entropy', 'max_depth': None} 0.9519377868434471
Random Forest¶
InĀ [136]:
ran_forest_params={"criterion":['gini','entropy'],"max_depth":[1,2,5,7,None]}
ran_forest_grid=GridSearchCV(ran_forest,ran_forest_params,cv=5)
ran_forest_grid.fit(x_res,y_res)
Out[136]:
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 5, 7, None]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 5, 7, None]})RandomForestClassifier()
RandomForestClassifier()
InĀ [137]:
ran_forest_result=pd.DataFrame(ran_forest_grid.cv_results_)
best_parameters_ran_forest=ran_forest_grid.best_params_
print(best_parameters_ran_forest,ran_forest_grid.best_score_)
{'criterion': 'gini', 'max_depth': None} 0.9662417134115246
FINAL MODELS¶
InĀ [138]:
from sklearn.metrics import roc_curve
InĀ [139]:
knn1= KNeighborsClassifier(n_neighbors=10,weights='distance')
InĀ [140]:
clf1=knn1.fit(x_res,y_res)
y_pred_knn1=knn1.predict(X_test_scaled)
accuracy_score(y_test, y_pred_knn1)
print(classification_report(y_test,y_pred_knn1))
precision recall f1-score support
0 0.92 0.96 0.94 14622
1 0.94 0.90 0.92 11354
accuracy 0.93 25976
macro avg 0.93 0.93 0.93 25976
weighted avg 0.93 0.93 0.93 25976
InĀ [141]:
dtf=DecisionTreeClassifier(criterion="entropy", max_depth=None)
dtf.fit(x_res,y_res)
y_pred_dtf=dtf.predict(X_test_scaled)
accuracy_score(y_test, y_pred_dtf)
print(classification_report(y_test,y_pred_dtf))
precision recall f1-score support
0 0.95 0.95 0.95 14622
1 0.94 0.94 0.94 11354
accuracy 0.95 25976
macro avg 0.95 0.95 0.95 25976
weighted avg 0.95 0.95 0.95 25976
InĀ [142]:
ran_forestf=RandomForestClassifier(criterion="entropy",max_depth=None)
ran_forestf.fit(x_res,y_res)
y_pred_ran_forest_f=ran_forest.predict(X_test_scaled)
accuracy_score(y_test, y_pred_ran_forest_f)
print(classification_report(y_test,y_pred_ran_forest_f))
precision recall f1-score support
0 0.95 0.98 0.97 14622
1 0.97 0.94 0.96 11354
accuracy 0.96 25976
macro avg 0.96 0.96 0.96 25976
weighted avg 0.96 0.96 0.96 25976
Feature Importance¶
InĀ [143]:
# Get the feature importances
importances = ran_forestf.feature_importances_
# Convert X_train_scaled back to DataFrame to get column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# Map the feature importances to their corresponding feature names
feature_importance_df = pd.DataFrame({
'Feature': X_train_scaled_df.columns,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Display the feature importances
print(feature_importance_df)
# Plot the feature importances
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importances from Random Forest Classifier')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
Feature Importance 7 inflight_wifi_service 0.159751 12 online_boarding 0.150886 4 type_of_travel 0.094083 5 customer_class 0.089571 14 inflight_entertainment 0.050554 13 seat_comfort 0.046087 9 ease_of_online_booking 0.046067 16 leg_room_service 0.041740 2 customer_type 0.040793 6 flight_distance 0.038738 3 age 0.037743 15 onboard_service 0.030564 17 baggage_handling 0.030010 20 cleanliness 0.026325 19 inflight_service 0.026037 18 checkin_service 0.022429 10 gate_location 0.018200 8 departure_arrival_time_convenient 0.017686 21 departure_delay_in_minutes 0.014955 11 food_and_drink 0.012665 1 Gender 0.005117 0 const 0.000000
C:\Users\Ramasish Chatterjee\AppData\Local\Temp\ipykernel_1784\1183606931.py:18: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
InĀ [144]:
df.shape
Out[144]:
(129880, 22)